true

Setting up how many cores to use for analysis and fold-enrichment vs freqency

# cores to use for multiprocessing: is the number of cores is small -> we are not on a cluster -> use all; if the number of cores is large -> we are on cluster -> use only 15 or 7 or 31 cores (n requested-1)
if(detectCores() <= 4) cores_to_use = detectCores() - 1
if(detectCores() > 4) cores_to_use = 6
cores_to_use = NULL

# how many permutations?
N_permut = 1000

filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","",Sys.Date()),".RData")
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","","2018-08-17"),".RData")

How many of the domains identified using our approach are in the ELM database (as compared to the population of all domains we tested)?

Calculate empirical pvalues using count of a domain among human interacting partners of each human protein

file_all = "./processed_data_files/human_net_w_domains"
gunzip(paste0(file_all,".gz"), remove = F)
data_all = fread(file_all, sep = "\t", stringsAsFactors = F)
data_all = data_all[IDs_interactor_human_B != "UNKNOWN"]
unlink(file_all)

file_BioPlex3 = "./processed_data_files/BioPlex3human_net_w_domains"
gunzip(paste0(file_BioPlex3,".gz"), remove = F)
data_BioPlex3 = fread(file_BioPlex3, sep = "\t", stringsAsFactors = F)
data_BioPlex3 = data_BioPlex3[IDs_interactor_human_B != "UNKNOWN"]
unlink(file_BioPlex3)

# count function: set up standard parameters
permuteCount = function(data, select_nodes = NULL, also_permuteYZ = F, N_permut = 10){
    permutationPval(interactions2permute = IDs_interactor_human_A ~ IDs_interactor_human_B, # first set of interacting pairs (XY) that are to be permuted
                    associations2test = IDs_interactor_human_A ~ IDs_domain_human_B, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
                    node_attr = list(IDs_interactor_human_A ~ IDs_interactor_human_A_degree,
                                     IDs_domain_human_B ~ domain_count,
                                     IDs_interactor_human_A + IDs_domain_human_B ~ domain_frequency_per_IDs_interactor_human_A),
                    data = data,
                    statistic = IDs_interactor_human_A + IDs_domain_human_B ~ .N,
                    select_nodes = select_nodes,
                    N = N_permut,
                    cores = NULL, seed = 2, also_permuteYZ = F,
                    clustermq = T, clustermq_mem = 12500,
                    split_comp_inner_N = 3, clustermq_jobs = 60,
                    clustermq_log_worker = F)
}
# count: all protein interactions in IntAct and all domains - # permute IDs_interactor_human_A ~ IDs_interactor_human_B
load(filename)
if(!"res_count_all" %in% objects()){
    time = proc.time()
    res_count_all = permuteCount(data_all, select_nodes = IDs_domain_human_B ~ domain_count >= 1,
                                 N_permut = 1800)
    proc.time() - time
    # save results
    save(list = ls(), file=filename)
}
plot(res_count_all, main = "count: all proteins and domains")

# count: protein interactions in BioPlex3 and all domains - # permute IDs_interactor_human_A ~ IDs_interactor_human_B
if(!"res_count_all" %in% objects()){
    time = proc.time()
    res_count_BioPlex3 = permuteCount(data_BioPlex3, select_nodes = IDs_domain_human_B ~ domain_count >= 1,
                                      N_permut = 40)
    proc.time() - time
    # save results
    save(list = ls(), file=filename)
}
plot(res_count_BioPlex3, main = "count: all proteins and domains (BioPlex)")

Human protein degree and the background domain count of top-scoring proteins

# count
PermutResult2D(res = res_count_all, N = 250) +
    ggtitle("2D-bin plots of 250 top-scoring human protein - domain pairs, \n statistic: count of a domain among interacting partners of a protein (IntAct)")

# Fisher test p-value
PermutResult2D(res = res_count_BioPlex3, N = 250) + 
    ggtitle("2D-bin plots of 250 top-scoring human protein - domain pairs, \n statistic: count of a domain among interacting partners of a protein (BioPlex3)")

Map domains known to interact with linear motifs from ELM to the domains we found

if(!file.exists("./data_files/interactiondomains.tsv")) downloader::download("http://elm.eu.org/interactiondomains.tsv","./data_files/interactiondomains.tsv")
interactiondomains = fread("./data_files/interactiondomains.tsv")
interactiondomains[, pfam_id := `Interaction Domain Id`]

domains_known = interactiondomains[, unique(pfam_id)]

InterProScan_domains = readInterProGFF3("../viral_project/processed_data_files/all_human_viral_protein_domains.gff3.gz")
# get InterProID to member database ID mapping
InterPro2memberDB = getInterPro2memberDB(InterProScan_domains)
InterPro2memberDB = InterPro2memberDB[complete.cases(InterPro2memberDB)]
domains_known_mapped = unique(InterPro2memberDB[memberDBID %in% domains_known | InterProID %in% domains_known, InterProID])
domains_not_mapped = unique(domains_known[!(domains_known %in% InterPro2memberDB$memberDBID | domains_known %in% InterPro2memberDB$InterProID)])

I did Fisher test to evaluate if the domains that we find are enriched in domains known to interact with linear motifs (from ELM). I have picked most likely human protein - human domain associations (by p-value). Then I counted how many known motif-binding domains we have found and did the Fisher test. Finally, I was choosing different p-value cutoffs.

testEnrichment = function(N, res, rank_by = "p.value", domains_known_mapped, random = F, name = "", decreasing = F){
    if(random) {
        res$data_pval = unique(res$data_with_pval[,c("IDs_interactor_human_A", "IDs_domain_human_B", rank_by, "domain_type", "domain_count", "IDs_interactor_human_A_degree"), with = F])
        domains_found = res$data_pval[sample(1:nrow(res$data_with_pval), N), unique(IDs_domain_human_B)]
    } else {
        res$data_pval = unique(res$data_with_pval[,c("IDs_interactor_human_A", "IDs_domain_human_B", rank_by, "domain_type", "domain_count", "IDs_interactor_human_A_degree"), with = F])
        ind = order(unlist(res$data_pval[, c(rank_by), with = F]), decreasing = decreasing)[1:N] 
        domains_found = res$data_pval[ind, unique(IDs_domain_human_B)]
    }
    
    alldomains = res$data_with_pval[, unique(IDs_domain_human_B)]
    known = factor(alldomains %in% domains_known_mapped, levels = c("TRUE", "FALSE"))
    found = factor(alldomains %in% domains_found, levels = c("TRUE", "FALSE"))
    table_res = table(known, found)
    
    test = fisher.test(table(known, found), alternative = "greater", conf.int = T)
    
    return(c(pval = test$p.value, odds_ratio = as.vector(test$estimate),
             count = table_res["TRUE", "TRUE"], total_count = sum(as.logical(found)),
             name = name))
}
runningTestEnrichment = function(res, name, rank_by = "p.value", decreasing = F, Ns){
    enrichment = sapply(Ns, testEnrichment, res = res, 
                        domains_known_mapped = domains_known_mapped, 
                        name = name, rank_by = rank_by, decreasing = decreasing)
    colnames(enrichment) = Ns
    return(enrichment)
}

Ns = seq(25, 1000, 40)
# count
enrichment = runningTestEnrichment(res_count_all, name = "domain frequency among interactors of a human protein, empirical pval", Ns = Ns)
enrichment_justfreq = runningTestEnrichment(res_count_all, rank_by = "observed_statistic", name = "domain count among interactors of a human protein, frequency", Ns = Ns) 
enrichmentBioPlex3 = runningTestEnrichment(res_count_BioPlex3, name = "domain frequency among interactors of a human protein (BioPlex3), empirical pval", Ns = Ns)

random_domains = function(N = 100, seed = seed, Ns = seq(25, 500, 25), res){
    set.seed(seed)
    
    quantiles = c(0.975, 0.75, 0.5, 0.25, 0.025)
    quantile_names = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
    
    all_temp = Q(fun = function(i) {
        library(data.table)
        enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[1:3,]
        rownames = rownames(enrichmentRANDOM)
        enrichmentRANDOM = matrix(as.numeric(unlist(enrichmentRANDOM)),nr=nrow(enrichmentRANDOM))
        colnames(enrichmentRANDOM) = Ns
        rownames(enrichmentRANDOM) = rownames
        enrichmentRANDOM
    }, 1:N,
    export = list(Ns = Ns, testEnrichment = testEnrichment,
                  res = res, domains_known_mapped = domains_known_mapped),
    seed = seed, memory = 1000, n_jobs = 20, rettype = "list")
    
    #pval_temp = replicate(N, {
    #    enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[1,]
    #    names(enrichmentRANDOM) = Ns
    #    as.numeric(enrichmentRANDOM)
    #})
    pval_temp = sapply(all_temp, function(m) m[1,])
    pval = apply(pval_temp, 1, quantile, probs = quantiles)
    rownames(pval) = quantile_names
    colnames(pval) = Ns
    
    #odds_ratio_temp = replicate(N, {
    #    enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[2,]
    #    names(enrichmentRANDOM) = Ns
    #    as.numeric(enrichmentRANDOM)
    #})
    odds_ratio_temp = sapply(all_temp, function(m) m[2,])
    odds_ratio = apply(odds_ratio_temp, 1, quantile, probs = quantiles)
    rownames(odds_ratio) = quantile_names
    colnames(odds_ratio) = Ns
    
    #count_temp = replicate(N, {
    #    enrichmentRANDOM = sapply(Ns, testEnrichment, res = res, domains_known_mapped = domains_known_mapped, random = T, name = "N random proteins")[3,]
    #    names(enrichmentRANDOM) = Ns
    #    as.numeric(enrichmentRANDOM)
    #})
    count_temp = sapply(all_temp, function(m) m[3,])
    count = apply(count_temp, 1, quantile, probs = quantiles)
    rownames(count) = quantile_names
    colnames(count) = Ns
    
    return(list(pval = pval, odds_ratio = odds_ratio, count = count))
}

if(!"enrichmentRANDOM" %in% objects()){
    enrichmentRANDOM = random_domains(1000, 1, res = res_count_all)
    # save results
    save(list = ls(), file=filename)
}
## Connecting ebi via SSH ...
## Sending common data ...
## Submitting 20 worker jobs (ID: 6531) ...
## Running 1,000 calculations (1 calls/chunk) ...
## Master: [437.7s 0.8% CPU]; Worker: [avg 99.1% CPU, max 568.4 Mb]

As we include more proteins, the number of known domains we find increases and then levels off (probably because some of the known SLIM-binding domains do not interact with viral proteins). ????? is this true for human? should always increase

plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "count", plot_type = "l")

As we include more proteins, the Fisher test odds ratio decreases (we add more stuff that is not known). Odds ratio measures how much more likely are we to find a domain using our procedure if it’s a known domain as compared to if it’s not a known domain.

plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "odds_ratio", plot_type = "l")

corresponding P-values from the Fisher test

plotEnrichment(enrichment, enrichment_justfreq, enrichmentBioPlex3,
               random_domains = enrichmentRANDOM, 
               domains_known_mapped = domains_known_mapped, type = "pval", plot_type = "l")

How many human proteins are known per each of the domain instances in top 200 protein-domain pairs?

selectTopHits = function(res, N){
    res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_human_A, IDs_domain_human_B, p.value, domain_type, domain_count, IDs_interactor_human_A_degree)])
    pairs200pval = res$data_pval[order(p.value, decreasing = F)[1:N], max(p.value)]
    
    restop100 = res
    restop100$data_with_pval = restop100$data_with_pval[p.value <= pairs200pval, ]
    restop100$data_with_pval[, N_human_per_human_w_domain := length(unique(IDs_interactor_human_A)), by = .(IDs_interactor_human_B, IDs_domain_human_B)]
    return(restop100)
}
restop100 = selectTopHits(res_count_all, N = 250)
plot(restop100, IDs_interactor_human_B ~ N_human_per_human_w_domain)

plot(restop100)

1122 human proteins with enriched domains have 5 or more human interacting partners.
476 human proteins with enriched domains have 10 or more human interacting partners.

what are those domains? are they known ELM-interacting domains? which proteins they are in?

restop100$data_with_pval[N_human_per_human_w_domain >= 10, unique(IDs_domain_human_B)]
##  [1] "IPR013083" "IPR013087" "IPR000608" "IPR016135" "IPR027417"
##  [6] "IPR001766" "IPR011991" "IPR000504" "IPR000980" "IPR001452"
## [11] "IPR001811" "IPR000719" "IPR011009" "IPR001841" "IPR011993"
## [16] "IPR003309" "IPR008916" "IPR032444" "IPR029071" "IPR001245"
## [21] "IPR020635" "IPR033899" "IPR004827" "IPR013783" "IPR027409"
## [26] "IPR027413" "IPR023410" "IPR003599" "IPR007110" "IPR013151"
## [31] "IPR015943" "IPR017986"
restop100$data_with_pval[N_human_per_human_w_domain >= 10, unique(IDs_domain_human_B)] %in% domains_known_mapped
##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE
## [12]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
restop100$data_with_pval[N_human_per_human_w_domain >= 10 & IDs_domain_human_B == "IPR000504", unique(IDs_interactor_human_B)]
##  [1] "O00425" "O14979" "O43390" "O60506" "O60812" "O75494" "O95758"
##  [8] "P05455" "P07910" "P08621" "P09012" "P09651" "P11940" "P14866"
## [15] "P19338" "P22626" "P23246" "P26368" "P26599" "P31483" "P31942"
## [22] "P31943" "P35637" "P38159" "P43243" "P51991" "P52272" "P52597"
## [29] "P55795" "P62995" "P84103" "P98175" "P98179" "Q01081" "Q01130"
## [36] "Q01844" "Q07955" "Q08170" "Q13151" "Q13242" "Q13243" "Q13247"
## [43] "Q13283" "Q13310" "Q13595" "Q14011" "Q14103" "Q14498" "Q15233"
## [50] "Q15287" "Q15415" "Q15427" "Q15717" "Q16629" "Q16630" "Q86V81"
## [57] "Q96PK6" "Q99729" "Q9BWF3" "Q9NR30" "Q9NZI8" "Q9UBU9" "Q9UHX1"
## [64] "Q9UKA9" "Q9UKM9" "Q9UKV3" "Q9Y5S9"
DT::datatable(restop100$data_with_pval[N_human_per_human_w_domain >= 10,])
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

R session information

filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","",Sys.Date()),".RData")
filename = paste0("./processed_data_files/predict_domain_human_clust",gsub("-","","2017-10-19"),".RData")
save(list = ls(), file=filename)
#R.utils::gzip(filename = "./processed_data_files/predict_domain_human_clust11092017.RData",
#              destname = "./processed_data_files/predict_domain_human_clust11092017.RData.gz",
#              remove = T, overwrite = T)
Sys.Date()
## [1] "2018-08-17"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] clustermq_0.8.4.1    R.utils_2.6.0        R.oo_1.22.0         
##  [4] R.methodsS3_1.7.1    RColorBrewer_1.1-2   GGally_1.4.0        
##  [7] ggplot2_3.0.0        rtracklayer_1.40.4   GenomicRanges_1.32.6
## [10] GenomeInfoDb_1.16.0  MItools_0.1.38       Biostrings_2.48.0   
## [13] XVector_0.20.0       data.table_1.11.4    PSICQUIC_1.18.1     
## [16] plyr_1.8.4           httr_1.3.1           biomaRt_2.36.1      
## [19] IRanges_2.14.10      S4Vectors_0.18.3     BiocGenerics_0.26.0 
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                matrixStats_0.54.0         
##  [3] bit64_0.9-7                 progress_1.2.0             
##  [5] rprojroot_1.3-2             tools_3.5.1                
##  [7] backports_1.1.2             R6_2.2.2                   
##  [9] DT_0.4                      KernSmooth_2.23-15         
## [11] DBI_1.0.0                   lazyeval_0.2.1             
## [13] colorspace_1.3-2            withr_2.1.2                
## [15] tidyselect_0.2.4            prettyunits_1.0.2          
## [17] ontologyIndex_2.4           bit_1.1-14                 
## [19] compiler_3.5.1              Biobase_2.40.0             
## [21] DelayedArray_0.6.4          caTools_1.17.1.1           
## [23] scales_1.0.0                stringr_1.3.1              
## [25] digest_0.6.15               Rsamtools_1.32.2           
## [27] rmarkdown_1.10              pkgconfig_2.0.1            
## [29] htmltools_0.3.6             htmlwidgets_1.2            
## [31] rlang_0.2.1                 RSQLite_2.1.1              
## [33] shiny_1.1.0                 bindr_0.1.1                
## [35] jsonlite_1.5                crosstalk_1.0.0            
## [37] BiocParallel_1.14.2         gtools_3.8.1               
## [39] dplyr_0.7.6                 RCurl_1.95-4.11            
## [41] magrittr_1.5                GenomeInfoDbData_1.1.0     
## [43] Matrix_1.2-14               Rcpp_0.12.18               
## [45] munsell_0.5.0               proto_1.0.0                
## [47] stringi_1.2.4               yaml_2.2.0                 
## [49] SummarizedExperiment_1.10.1 zlibbioc_1.26.0            
## [51] gplots_3.0.1                qvalue_2.12.0              
## [53] grid_3.5.1                  blob_1.1.1                 
## [55] promises_1.0.1              gdata_2.18.0               
## [57] crayon_1.3.4                lattice_0.20-35            
## [59] splines_3.5.1               hms_0.4.2                  
## [61] knitr_1.20                  pillar_1.3.0               
## [63] reshape2_1.4.3              XML_3.98-1.15              
## [65] glue_1.3.0                  evaluate_0.11              
## [67] downloader_0.4              httpuv_1.4.5               
## [69] gtable_0.2.0                purrr_0.2.5                
## [71] reshape_0.8.7               assertthat_0.2.0           
## [73] gsubfn_0.7                  mime_0.5                   
## [75] xtable_1.8-2                later_0.7.3                
## [77] tibble_1.4.2                GenomicAlignments_1.16.0   
## [79] AnnotationDbi_1.42.1        memoise_1.1.0              
## [81] bindrcpp_0.2.2              rzmq_0.9.3                 
## [83] ROCR_1.0-7